This document includes the analysis of simple and complex satellite abundances in different Black/6 and Black/10 mouse strains, and uses the written history (generations at splits along the “phylogeny”) to calculate rates of copy number change.

First, load in and process the data

data <- read.table("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_MA_30x_trimmed_data_kseek.rep.normGC", header=T)
#data[1:5, 1:5]
nrow(data)
## [1] 13749
#colnames(data)

#need to flip the data frame since it's in the opposite orientation
data2 <- data.frame(t(data[-1]))
# match the file names to the names of the lines  
#data2[, 1:5]
colnames(data2) <- data[,1]

line_names <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_line_names.csv", header=F)
line_files <- as.vector(line_names[,1])
line_names <- as.vector(line_names[,2])
names(line_names) <- line_files
line_names_df <- as.data.frame(line_names)

data_withnames <- merge (data2, line_names, by=0)
lines <- as.vector(data_withnames[,"y"])
data_withnames[,"y"] <- NULL
data_withnames[,1] <- NULL
rownames (data_withnames) <- lines
#data_withnames[,1:3]
#data2[,1:5]

# simplify the kmer names 
kmer.labels <- sapply(strsplit(colnames(data_withnames), "\\/"), '[', 1)
colnames(data_withnames) <- kmer.labels

# now calculate the means and sort the data
kmer_means <- colMeans(data_withnames)
data_sorted <- data_withnames[,order(kmer_means, decreasing=T)]  

Next, select the kmers to focus on based on abundance, and plot PCAs

# first use common kmers, which have an average abundance of at least 100
get_common_kmers <- function (kmer_matrix) {
  common.kmer.indexes <- c()
  for (i in 1:ncol(kmer_matrix)) {
    if (mean(kmer_matrix[,i])>=100) {
      common.kmer.indexes <- c(common.kmer.indexes, i)
    }
  }
  names(common.kmer.indexes) <- colnames(kmer_matrix)[common.kmer.indexes]
  return (common.kmer.indexes)
}
data_sorted_common <- data_sorted[,get_common_kmers(data_sorted)]
#data_sorted_common[,1:5]
ncol(data_sorted_common)
## [1] 63
AG_repeats <- c("AG", "AAG", "AAAG", "AGG", "AAGGAG", "AAGG", "AAAGG", "AAGAG", "AAGGG", "AGAGG", "AAAAAG", "AAAGAG", "AAAGGG", "AAAAAG", "AGAGGG", "AAGAGG", "AAAGGAAG", "AGGG")
data_sorted_common[,AG_repeats]
##                          AG      AAG      AAAG       AGG   AAGGAG     AAGG
## BL6/NTac           67256.45 26347.52  7296.452  8533.097 2418.486 2198.468
## BL6/NCrl           69750.00 38055.28 10579.327  8329.694 2560.842 2322.308
## BL6/NHsd           48925.53 16230.10  5543.591  7846.022 1587.977 1526.284
## BL6N-TyrCBrdCrlCrl 70561.23 38934.90 10379.190  9185.140 2827.708 2397.239
## BL6/JBomTac        66683.30 32800.02  9477.913  8461.805 2386.488 2309.149
## BL6/J              60032.36 27832.02  7200.989  9617.130 2744.191 2227.678
## BL10/ScSnJ         64246.86 40959.64  8851.526 10087.380 2986.484 2297.634
## BL10/SnJ           62270.60 42314.92 10284.460  9944.040 2981.819 2302.539
## BL10/ScCr          50443.03 17614.41  5509.765 10019.725 2079.948 1839.200
## BL6/NJ             63688.51 39818.80  9769.525  9866.128 2859.624 2327.099
## BL10/J             66349.53 41738.66 10232.616  8473.754 2656.981 2079.099
## BL6/ByJ            65096.19 36217.82 10223.577  8199.549 2427.937 2149.029
## BL6/JEiJ           62512.38 33133.06  9364.256  8260.159 2339.655 2148.763
## BL10/ScNHsd        63805.98 38974.22 10193.485  7739.205 2386.076 2139.598
##                        AAAGG     AAGAG     AAGGG    AGAGG   AAAAAG
## BL6/NTac           1736.9611 1042.5865  999.5002 535.6924 245.2341
## BL6/NCrl           1881.9835 1211.2068  960.0620 565.0340 219.2628
## BL6/NHsd            875.7659  624.7763  835.4631 449.1689 244.1678
## BL6N-TyrCBrdCrlCrl 1950.5955 1203.5164 1104.6945 629.0574 237.9968
## BL6/JBomTac        1835.7438 1230.6188  944.1000 588.7770 226.2638
## BL6/J              1656.7825 1041.2083  974.7886 573.7315 247.8997
## BL10/ScSnJ         1869.7141 1199.3150 1018.3826 583.8824 229.5712
## BL10/SnJ           1925.2236 1227.2337  983.7501 579.4867 246.9304
## BL10/ScCr           992.8838  685.8580 1007.0211 540.3200 268.6505
## BL6/NJ             1897.0839 1204.7733 1020.5747 644.0331 228.0036
## BL10/J             1762.1477 1211.4311  799.3172 492.8769 271.2413
## BL6/ByJ            1781.3128 1106.8807  881.1411 512.5859 248.5310
## BL6/JEiJ           1737.5669 1089.5450  843.0328 528.1448 258.9078
## BL10/ScNHsd        1792.5522 1162.6397  830.4409 536.9965 216.8835
##                      AAAGAG   AAAGGG AAAAAG.1   AGAGGG    AAGAGG  AAAGGAAG
## BL6/NTac           263.2982 251.1064 245.2341 223.9919 139.93000 122.36293
## BL6/NCrl           360.0023 253.2182 219.2628 198.2796 154.23436 152.96323
## BL6/NHsd           157.4740 191.8332 244.1678 184.2827  59.50754  60.94818
## BL6N-TyrCBrdCrlCrl 381.7134 297.4409 237.9968 235.0870 170.15005 139.63608
## BL6/JBomTac        359.0054 247.6448 226.2638 221.1700 132.16278 142.68280
## BL6/J              278.0870 294.7840 247.8997 187.6473 128.91459 114.64040
## BL10/ScSnJ         369.5459 297.4225 229.5712 191.3634 136.97390 135.56400
## BL10/SnJ           417.5889 312.8885 246.9304 175.6377 138.50373 138.72122
## BL10/ScCr          164.5873 247.0828 268.6505 206.0989  71.88263  60.08225
## BL6/NJ             417.2335 305.4141 228.0036 210.1792 151.32141 136.65068
## BL10/J             399.0728 264.1721 271.2413 208.6264 127.90973 144.12123
## BL6/ByJ            354.3842 240.8161 248.5310 195.0101 132.82618 143.98770
## BL6/JEiJ           320.7037 235.4233 258.9078 201.1888 118.99387 129.50019
## BL10/ScNHsd        338.5952 212.5425 216.8835 216.0247 124.51339 147.25015
##                        AGGG
## BL6/NTac           142.7499
## BL6/NCrl           123.2662
## BL6/NHsd           119.5792
## BL6N-TyrCBrdCrlCrl 146.3483
## BL6/JBomTac        133.7977
## BL6/J              122.2764
## BL10/ScSnJ         135.3885
## BL10/SnJ           125.6165
## BL10/ScCr          151.9664
## BL6/NJ             133.4303
## BL10/J             117.8604
## BL6/ByJ            110.3731
## BL6/JEiJ           132.9163
## BL10/ScNHsd        117.4796
sum(kmer_means[AG_repeats])
## [1] 125050.6
AC_repeats <- c("AC", "ACC", "AACC", "AAAAAC", "AAAACC", "AAAAACC", "AAACC")
sum(kmer_means[AC_repeats])
## [1] 192392.3
which(colnames(data_sorted_common)=="AGGG")
## [1] 51
# now make a PCA using these 63 kmers

pca.data <- prcomp(data_sorted_common, scale.=T)
library(ggbiplot)
## Loading required package: ggplot2
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2020a.
## 1.0/zoneinfo/America/New_York'
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)  

g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8,choices=c(1,3), labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

# do a PCA using more data - see if there is separation. 

get_legit_kmer_indexes <- function (kmer_mx) {
  indexes <- c()
  for (i in 1:ncol(kmer_mx)) { ## go through each column of the data
    p <- kmer_mx[,i] >= 10 ## at least 10 copies
    if (sum(p) >= 1) { # if one sample have at least 2 copies
      indexes <- c(indexes, i)
    }
  }
  return(indexes)
}
legit_indexes <- get_legit_kmer_indexes(data_sorted)
length(legit_indexes)
## [1] 434
# 434 kmers have at least 10 copies in at least one sample - use these now for the PCA. 
legit_data <- data_sorted[,legit_indexes]

# change the name
# colour them differently

rownames(legit_data)[4] <- "BL6/NTyrCBrdCrlCrl" 

strain <- sapply(strsplit(as.vector(rownames(legit_data)), "\\/"), '[', 1)


pca.data <- prcomp(legit_data, scale.=T)

g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

# here's some separation
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = strain, var.axes = F, alpha=0.8, choices=c(1,3))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, choices=c(1,3), labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

Get the amount of satellites in base pairs in each line

get_abskmer_mx <- function (kmer_mx, indexes) {
  kmers.abs <- matrix(nrow=nrow(kmer_mx), ncol=max(indexes))
  for (i in indexes) {
    for (j in 1:nrow(kmer_mx)) {
    kmers.abs[j,i] <- kmer_mx[j,i] * (nchar(colnames(kmer_mx)[i], type="chars")) 
    }
  }
  kmers.abs <- kmers.abs[,colSums(is.na(kmers.abs)) != nrow(kmers.abs)]# get rid of columns that are all NAs
  rownames(kmers.abs) <- rownames(kmer_mx)
  colnames(kmers.abs) <- colnames(kmer_mx)
  return(kmers.abs)
}

# get total kmer content per MA line
get_total_kmercontent <- function (abs_mx) {
  total.kmers.abs <- c()
  for (i in 1:nrow(abs_mx)){
    total.kmers.abs[i] <- sum(abs_mx[i,])/10^6
  }
  names(total.kmers.abs) <- rownames(abs_mx)
  return (total.kmers.abs)
}

data_abs <- get_abskmer_mx(legit_data, c(1:ncol(legit_data)))
total_kmer_content <- get_total_kmercontent(data_abs)
total_kmer_content # note, this is in MB
##           BL6/NTac           BL6/NCrl           BL6/NHsd 
##           1.222917           1.288536           1.073765 
## BL6/NTyrCBrdCrlCrl        BL6/JBomTac              BL6/J 
##           1.393987           1.270839           1.388050 
##         BL10/ScSnJ           BL10/SnJ          BL10/ScCr 
##           1.472052           1.469211           1.179894 
##             BL6/NJ             BL10/J            BL6/ByJ 
##           1.469178           1.274572           1.285923 
##           BL6/JEiJ        BL10/ScNHsd 
##           1.215779           1.250947
boxplot(total_kmer_content, las=2)

Bring in the tree and visualize how kmer abundance evolves along the tree

library(phytools)
## Loading required package: ape
## Loading required package: maps
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
mouse_tree <- read.tree("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/manual_ext.tre")
plot(mouse_tree)

# need to rename so they match the tree

new_rownames <- gsub("/", "", names(total_kmer_content))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2

names(total_kmer_content) <- new_rownames2
#total_kmer_content

obj<-contMap(mouse_tree,total_kmer_content,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

Get kmer lengths and GC content

Calc_kmer_length <- function (kmer_string) {
  return (nchar(kmer_string))
}
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.3.3
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plyr':
## 
##     rename
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.3.3
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:plyr':
## 
##     desc
## Loading required package: XVector
## Warning: package 'XVector' was built under R version 3.3.3
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
Calc_GC_content <- function (kmer_string) {
  string <- BString(kmer_string)
  gc.content <- c(letterFrequency(string, letters=c("CG"), as.prob=F)/(length(string)))
  return(gc.content)
}
common.kmers <- colnames(data_sorted_common)

common.kmer.lengths <- sapply(common.kmers, Calc_kmer_length)
names(common.kmer.lengths) <- common.kmers
#common.kmer.lengths[1:10]

common.kmer.gc <- sapply(common.kmers, Calc_GC_content)
names(common.kmer.gc) <- common.kmers
#common.kmer.gc[1:10]

hist(common.kmer.lengths, breaks=20)

summary(common.kmer.lengths) # median = 6
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   4.000   6.000   6.524   6.000  19.000
length(which(common.kmer.lengths==4)) # 11 
## [1] 11
length(which(common.kmer.lengths==5)) #8 
## [1] 8
length(which(common.kmer.lengths==6)) # 22
## [1] 22
length(common.kmer.lengths)
## [1] 63
hist(common.kmer.gc, breaks=10)

summary(common.kmer.gc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3333  0.5000  0.4456  0.5895  0.7500
# look at legit kmers now
legit.kmers <- colnames(legit_data)
legit.kmer.lengths <- sapply(legit.kmers, Calc_kmer_length)
names(legit.kmer.lengths) <- legit.kmers

legit.kmer.gc <- sapply(legit.kmers, Calc_GC_content)
names(legit.kmer.gc) <- legit.kmers


hist(legit.kmer.lengths, breaks=20)

hist(legit.kmer.gc, breaks=10)

# note: telomere satellite is common.kmers[4]:
common.kmers[4]
## [1] "AACCCT"
total_kmer_content
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         1.222917         1.288536         1.073765         1.393987 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         1.270839         1.388050         1.472052         1.469211 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         1.179894         1.469178         1.274572         1.285923 
##           B6JEiJ        B10ScNHsd 
##         1.215779         1.250947

Boxplot of abundant kmers

abun_data <- c()
for (i in 1:ncol(data_sorted_common)) {
  abun_data <- c(abun_data, data_sorted_common[,i])
}

kmer_names <- c()
for (i in 1:ncol(data_sorted_common)) {
  kmer_names <- c(kmer_names, c(rep(colnames(data_sorted_common)[i], times = nrow(data_sorted_common))))
}

rownames(data_sorted_common)[4] <- "BL6/NTyrCBrdCrlCrl" 
abun_df <- data.frame(rownames(data_sorted_common), kmer_names, abun_data)
abun_df[1:15, ]
##    rownames.data_sorted_common. kmer_names abun_data
## 1                      BL6/NTac         AC 185511.68
## 2                      BL6/NCrl         AC 182274.28
## 3                      BL6/NHsd         AC 182867.67
## 4            BL6/NTyrCBrdCrlCrl         AC 192672.86
## 5                   BL6/JBomTac         AC 188531.48
## 6                         BL6/J         AC 202091.49
## 7                    BL10/ScSnJ         AC 202385.62
## 8                      BL10/SnJ         AC 199890.74
## 9                     BL10/ScCr         AC 204310.46
## 10                       BL6/NJ         AC 197374.94
## 11                       BL10/J         AC 175373.72
## 12                      BL6/ByJ         AC 178864.78
## 13                     BL6/JEiJ         AC 180017.66
## 14                  BL10/ScNHsd         AC 182467.11
## 15                     BL6/NTac         AG  67256.45
colnames(abun_df) <- c("Sample", "kmer", "Copy")
strain <- sapply(strsplit(as.vector(abun_df$Sample), "\\/"), '[', 1)

abun_df <- data.frame(abun_df, strain)
#abun_df


library(ggplot2)
abun_df$kmer <- factor(abun_df$kmer, levels=colnames(data_sorted_common))
# this plot needs some work
ggplot(data = abun_df[c(1:70),], aes(x = kmer, y = Copy)) + 
  geom_boxplot() +
  geom_jitter(aes(color=strain)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1)) 

ggplot(data = abun_df[c(71:140),], aes(x = kmer, y = Copy)) + 
  geom_boxplot() +
  geom_jitter(aes(color=strain)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

Now, calculate mutation rates: Set it up with functions

# note, this part is needed to make the table I import in:
mouse_tree$tip.label
##  [1] "B10SnJ"           "B10ScSnJ"         "B10ScCr"         
##  [4] "B10ScNHsd"        "B10J"             "B6ByJ"           
##  [7] "B6NCrl"           "B6NHsd"           "B6NTac"          
## [10] "B6NTyrCBrdCrlCrl" "B6NJ"             "B6JEiJ"          
## [13] "B6JBomTac"        "B6J"
mouse_tree$Nnode # 13 nodes now
## [1] 13
findMRCA(mouse_tree, tips=mouse_tree$tip.label, type="node")
## [1] 15
findMRCA(mouse_tree, tips=c("B6J", "B6ByJ"), type="node")
## [1] 20
findMRCA(mouse_tree, tips=c("B6J", "B6JEiJ"), type="node")
## [1] 26
findMRCA(mouse_tree, tips=c("B6J", "B6JBomTac"), type="node")
## [1] 27
findMRCA(mouse_tree, tips=c("B6NJ", "B6ByJ"), type="node")
## [1] 21
findMRCA(mouse_tree, tips=c("B6NJ", "B6NCrl"), type="node")
## [1] 22
findMRCA(mouse_tree, tips=c("B6NJ", "B6NHsd"), type="node")
## [1] 23
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTac"), type="node")
## [1] 24
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTyrCBrdCrlCrl"), type="node")
## [1] 25
findMRCA(mouse_tree, tips=c("B10J", "B10SnJ"), type="node")
## [1] 16
findMRCA(mouse_tree, tips=c("B10J", "B10ScSnJ"), type="node")
## [1] 17
findMRCA(mouse_tree, tips=c("B10J", "B10ScCr"), type="node")
## [1] 18
findMRCA(mouse_tree, tips=c("B10J", "B10ScNHsd"), type="node")
## [1] 19
# here is the node info file:
node_info <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/node_info_Mar17-20.csv", header = T)

# here is a function to calculate mutation rates for our tree, given an abundance matrix

calc_mutation_rates <- function (df_abun) {
  mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
  for (i in 1:ncol(df_abun)) {
    kmer_subset <- df_abun[,i]
    names(kmer_subset) <- rownames(df_abun)
    anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
    for (j in 1:nrow(df_abun)) {
      subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
      mu_mx[j,i] <- (df_abun[j,i] - anc_states$ace[subset_table$nearest_node]) / subset_table$gens
    }
  }
  rownames(mu_mx) <- rownames(df_abun)
  colnames(mu_mx) <- colnames(df_abun)
  return(mu_mx)
}

# check for total kmer content
anc_states <- fastAnc(mouse_tree, total_kmer_content, vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
##       15       16       17       18       19       20       21       22 
## 1.314349 1.347041 1.338262 1.309295 1.282353 1.281658 1.275193 1.270799 
##       23       24       25       26       27 
## 1.261257 1.299648 1.371612 1.283420 1.305079 
## 
## Variances on ancestral states:
##       15       16       17       18       19       20       21       22 
## 0.019104 0.005736 0.004946 0.004616 0.003865 0.005283 0.003684 0.003130 
##       23       24       25       26       27 
## 0.002635 0.002244 0.001809 0.003905 0.003184 
## 
## Lower & upper 95% CIs:
##       lower    upper
## 15 1.043446 1.585252
## 16 1.198596 1.495486
## 17 1.200424 1.476101
## 18 1.176135 1.442455
## 19 1.160505 1.404202
## 20 1.139196 1.424119
## 21 1.156236 1.394149
## 22 1.161153 1.380445
## 23 1.160639 1.361876
## 24 1.206795 1.392501
## 25 1.288246 1.454979
## 26 1.160935 1.405906
## 27 1.194482 1.415675
1.073765 - 1.264150 # lost by B6NHsd since common ancestor with below
## [1] -0.190385
1.469178 - 1.264150 # gained by B6NJ since common ancestor with below
## [1] 0.205028
# here is a function that calculates normalized-by-copy-number mutation rates, given the original abundance matrix and the non-normalized mutation matrix

calc_norm_mu <- function (df_abun, mu_mx) {
  mu_mx_norm <- matrix(nrow=nrow(mu_mx), ncol=ncol(mu_mx))
  for (i in 1:ncol(mu_mx)) {
    kmer_subset <- df_abun[,i]
    names(kmer_subset) <- rownames(df_abun)
    anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T) # get the ancestral copy number
    for (j in 1:nrow(mu_mx)) {
      subset_table <- node_info[which(node_info$Sample==rownames(mu_mx)[j]),]
      mu_mx_norm[j,i] <- mu_mx[j,i]/anc_states$ace[subset_table$nearest_node]
    }
  }
  rownames(mu_mx_norm) <- rownames(mu_mx)
  colnames(mu_mx_norm) <- colnames(mu_mx)
  return(mu_mx_norm)
}

# this is a function that calculates absolute mutation rates (taking the absolute value of additions and subtractions).

calc_mutation_rates_abs <- function (df_abun) {
  mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
  for (i in 1:ncol(df_abun)) {
    kmer_subset <- df_abun[,i]
    names(kmer_subset) <- rownames(df_abun)
    anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
    for (j in 1:nrow(df_abun)) {
      subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
      mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
    }
  }
  rownames(mu_mx) <- rownames(df_abun)
  colnames(mu_mx) <- colnames(df_abun)
  return(mu_mx)
}

Now, do the calculations for the common kmers

new_rownames <- gsub("/", "", rownames(data_sorted_common))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2

rownames(data_sorted_common) <- new_rownames2

# regular
mu_common <- calc_mutation_rates(data_sorted_common)
per_kmer_mu<- colMeans(mu_common)
length(per_kmer_mu)
## [1] 63
kmer_means_sorted <- sort (kmer_means, decreasing = T)
names(kmer_means_sorted)[1:63]
##  [1] "AC"                  "AG"                  "AAG"                
##  [4] "AACCCT"              "AGAT"                "AGAGGC"             
##  [7] "AAAG"                "AGG"                 "ACAGC"              
## [10] "AAGGAG"              "AAGG"                "ACC"                
## [13] "ATCC"                "AAAGG"               "ACAG"               
## [16] "AAGTCTGCACACACATACT" "ACCT"                "ACACAG"             
## [19] "AT"                  "AAGAG"               "AAGGG"              
## [22] "AGAGAGGC"            "ACATAT"              "ACAGACAGCACAGCACAGC"
## [25] "AGC"                 "AGAGG"               "AAAGACAGACAG"       
## [28] "AAAAACAAGGGAGATAT"   "AGGC"                "ACAGACAGCACAGC"     
## [31] "AACC"                "AAAAG"               "AAAGAG"             
## [34] "AGATAT"              "AGCAGG"              "ACAGAT"             
## [37] "AAAGGG"              "AACAAG"              "AAAAAG"             
## [40] "AGGAGGC"             "AAGC"                "AGAGGG"             
## [43] "AGAGAGGCAGAGGC"      "AAAGC"               "ACAGGC"             
## [46] "AAACAAGGGAGATAT"     "ACAGAG"              "ACAT"               
## [49] "ACT"                 "AGAGAT"              "AGGG"               
## [52] "AAAAAC"              "AAGAGG"              "AAAGGAAG"           
## [55] "AAAAATCTTAAAGG"      "ACTGCT"              "ACACAT"             
## [58] "AAACC"               "AAAACC"              "AAAAACC"            
## [61] "ACACATAT"            "AGCAGGAGGAGG"        "ACATAG"
plot(x = kmer_means_sorted[1:63], y = per_kmer_mu )

plot (x = kmer_means_sorted[7:63], y = per_kmer_mu[7:63])

# generally a positive correlation

# for each line, do a sign test.
binom_results <- c()
for (i in 1:nrow(mu_common)) {
  num_neg <- length(which(mu_common[i, ] < 0))
  total <- ncol(mu_common)
  p <- binom.test(num_neg, total)
  binom_results <- c(binom_results, p$p.value)
}
binom_results
##  [1] 8.980472e-04 1.000000e+00 5.152397e-03 4.295655e-02 3.135031e-01
##  [6] 5.152397e-03 2.227532e-03 3.760612e-05 8.013065e-01 3.367093e-04
## [11] 1.000000e+00 1.000000e+00 8.980472e-04 4.295655e-02
rownames(mu_common)[which(binom_results < 0.01)]
## [1] "B6NTac"   "B6NHsd"   "B6J"      "B10ScSnJ" "B10SnJ"   "B6NJ"    
## [7] "B6JEiJ"
### now do for each line - plot the common kmers abundance and mutation rate

#install.packages("ggallin")
library("ggallin")

rownames(data_sorted_common[1,])
## [1] "B6NTac"
a <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[1,]), y = as.numeric(mu_common[1,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[1,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[2,])
## [1] "B6NCrl"
b <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[2,]), y = as.numeric(mu_common[2,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[2,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[3,])
## [1] "B6NHsd"
c <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[3,]), y = as.numeric(mu_common[3,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[3,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[4,])
## [1] "B6NTyrCBrdCrlCrl"
d <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[4,]), y = as.numeric(mu_common[4,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[4,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[5,])
## [1] "B6JBomTac"
e <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[5,]), y = as.numeric(mu_common[5,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[5,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")


rownames(data_sorted_common[6,])
## [1] "B6J"
f <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[6,]), y = as.numeric(mu_common[6,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[6,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[7,])
## [1] "B10ScSnJ"
g <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[7,]), y = as.numeric(mu_common[7,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[7,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[8,])
## [1] "B10SnJ"
h <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[8,]), y = as.numeric(mu_common[8,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[8,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[9,])
## [1] "B10ScCr"
i <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[9,]), y = as.numeric(mu_common[9,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[9,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[10,])
## [1] "B6NJ"
j <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[10,]), y = as.numeric(mu_common[10,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[10,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[11,])
## [1] "B10J"
k <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[11,]), y = as.numeric(mu_common[11,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[11,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[12,])
## [1] "B6ByJ"
l <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[12,]), y = as.numeric(mu_common[12,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[12,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[13,])
## [1] "B6JEiJ"
m <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[13,]), y = as.numeric(mu_common[13,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[13,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[14,])
## [1] "B10ScNHsd"
n <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[14,]), y = as.numeric(mu_common[14,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[14,])) +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
grid.arrange(a,b,c,d,e,f,g,h,i,j,k,l,m,n, ncol=2)

#normalize by copy number

mu_common_norm <- calc_norm_mu(data_sorted_common, mu_common)


# per kmer average mutation rate

per_kmer_mu_norm_mean <- colMeans(mu_common_norm)
par(mar = c(2, 2, 2, 2))
plot(per_kmer_mu_norm_mean)

# one kmer has the most negative mutatation rate
which(per_kmer_mu_norm_mean==min(per_kmer_mu_norm_mean)) # normalized by copy number, most contracting
## AAAGACAGACAG 
##           27
# this is the one with phylogenetic signal
# verified this one manually to make sure there was no mistake.  

highest_kmer <- data_sorted_common[,27]
names(highest_kmer) <- rownames(data_sorted_common)

# look at amounts at each node
anc_states <- fastAnc(mouse_tree, highest_kmer, vars=T, CI=T)
anc_states$ace
##        15        16        17        18        19        20        21 
## 420.27294  72.76111  57.51539  43.91212  21.99731 767.78476 788.11806 
##        22        23        24        25        26        27 
## 805.51032 851.40148 864.22009 872.23266 824.66497 846.29339
obj<-contMap(mouse_tree,highest_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

par(mar = c(2, 2, 2, 2))
per_line_mu_norm_mean <- rowMeans(mu_common_norm)
plot(per_line_mu_norm_mean) # line 10 is most increasing, line 3 is most decreasing

per_line_mu_norm_mean
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##    -5.108446e-04     1.018382e-04    -9.047817e-04     2.279445e-04 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##    -2.278171e-04     4.288457e-04     1.220611e-04     1.862966e-04 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##    -3.973537e-04     8.150201e-04     2.706781e-05    -3.471784e-05 
##           B6JEiJ        B10ScNHsd 
##    -1.783642e-04    -3.200731e-04
# look at the telomere satellite

telomere_kmer <- data_sorted_common[,4]
names(telomere_kmer) <- rownames(data_sorted_common)

telomere_kmer
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         30146.60         24934.31         26172.35         31314.12 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         25013.94         45939.98         46029.90         40211.11 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         35171.94         39748.29         22781.08         30818.52 
##           B6JEiJ        B10ScNHsd 
##         24649.66         26089.82
# look at amounts at each node
anc_states <- fastAnc(mouse_tree, telomere_kmer, vars=T, CI=T)
anc_states$ace
##       15       16       17       18       19       20       21       22 
## 33122.94 35614.98 35362.01 33648.22 28314.51 30630.91 30000.19 29604.91 
##       23       24       25       26       27 
## 30121.95 31568.94 33730.17 30980.68 33096.57
obj<-contMap(mouse_tree,telomere_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# range 22781 - 46030 - factor of 2.

# look at CAG repeats (polyQ)

which(names((data_sorted_common))=="AGC")
## [1] 25
poly_Q <- data_sorted_common[,25]
names(poly_Q) <- rownames(data_sorted_common)

anc_states <- fastAnc(mouse_tree, poly_Q, vars=T, CI=T)
anc_states$ace
##       15       16       17       18       19       20       21       22 
## 587.9395 588.3390 589.6813 588.8991 584.2025 587.5399 584.7799 584.6911 
##       23       24       25       26       27 
## 585.0344 585.1220 585.3344 591.7505 587.1016
obj<-contMap(mouse_tree,poly_Q,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

poly_Q # very tight: 572-613 copies
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         584.8186         582.6070         585.9047         588.1745 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         572.0243         591.7187         609.6128         573.4039 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         595.1478         582.8482         577.5038         577.1086 
##           B6JEiJ        B10ScNHsd 
##         613.4282         584.0698
## Let's make a box plot of mutation rates ###

head(mu_common_norm)
##                             AC            AG           AAG        AACCCT
## B6NTac           -1.665755e-04  0.0004075430 -0.0016936235 -0.0004374261
## B6NCrl           -1.037055e-04  0.0005307961  0.0010510781 -0.0008913244
## B6NHsd           -1.520792e-04 -0.0015642071 -0.0033459144 -0.0009501471
## B6NTyrCBrdCrlCrl  4.070876e-05  0.0010917158  0.0012808578 -0.0011192028
## B6JBomTac        -1.399663e-04  0.0004652882  0.0004670982 -0.0021052905
## B6J               4.700042e-04 -0.0004409409 -0.0009093677  0.0033453313
##                           AGAT        AGAGGC          AAAG           AGG
## B6NTac           -0.0036641401 -3.605975e-04 -0.0015142443 -3.295951e-04
## B6NCrl            0.0015920712 -3.000287e-04  0.0010430853 -1.410237e-04
## B6NHsd           -0.0043442036  2.057965e-06 -0.0024647120 -6.330540e-04
## B6NTyrCBrdCrlCrl  0.0009625905  2.326053e-04  0.0015827283 -4.349942e-05
## B6JBomTac         0.0017003030 -2.519822e-05  0.0008803666 -3.848845e-04
## B6J              -0.0017989410  2.586653e-04 -0.0014021171  7.395840e-04
##                          ACAGC        AAGGAG          AAGG           ACC
## B6NTac           -6.017285e-05 -0.0002962075  7.938251e-05 -0.0006369315
## B6NCrl           -6.596555e-04  0.0003701609  4.853483e-04 -0.0001221603
## B6NHsd            3.251943e-04 -0.0023512420 -1.946080e-03 -0.0002215841
## B6NTyrCBrdCrlCrl  1.249040e-03  0.0008304876  8.059319e-04 -0.0005678451
## B6JBomTac        -2.921477e-04 -0.0004308801  2.956358e-04 -0.0005847944
## B6J               8.914476e-04  0.0007966639 -1.894608e-05  0.0008934752
##                           ATCC         AAAGG          ACAG
## B6NTac           -4.760371e-04  0.0001649562 -9.431094e-05
## B6NCrl            2.326906e-05  0.0007292390 -8.334183e-05
## B6NHsd           -9.738023e-04 -0.0032640963 -1.418911e-03
## B6NTyrCBrdCrlCrl -2.380564e-04  0.0010688596  2.446654e-03
## B6JBomTac        -2.671971e-04  0.0004943666  3.432759e-04
## B6J               5.718671e-04 -0.0003942336  6.148712e-05
##                  AAGTCTGCACACACATACT          ACCT        ACACAG
## B6NTac                 -0.0003693063 -0.0005101438 -0.0002956678
## B6NCrl                 -0.0007367637  0.0006628411 -0.0001270817
## B6NHsd                  0.0003864204 -0.0042674535 -0.0001753629
## B6NTyrCBrdCrlCrl       -0.0006380546  0.0026445685  0.0004140957
## B6JBomTac              -0.0012574995  0.0006555566 -0.0003819965
## B6J                     0.0017008338 -0.0005348536  0.0006864821
##                            AT         AAGAG         AAGGG      AGAGAGGC
## B6NTac           -0.002822038 -0.0002509086  8.403835e-05 -4.090651e-04
## B6NCrl            0.001638562  0.0007967816  9.550605e-05 -9.735448e-05
## B6NHsd           -0.004908572 -0.0027839155 -8.956826e-04 -9.252141e-05
## B6NTyrCBrdCrlCrl -0.001023197  0.0008232176  1.132484e-03  8.045671e-04
## B6JBomTac         0.001204549  0.0008527776  5.331611e-05 -2.558212e-05
## B6J              -0.000407570 -0.0006053300  3.352709e-04  1.654990e-04
##                         ACATAT ACAGACAGCACAGCACAGC           AGC
## B6NTac           -8.973783e-04       -0.0001011831 -5.035019e-06
## B6NCrl            3.277600e-04       -0.0006282529 -2.013794e-05
## B6NHsd           -8.395672e-04        0.0005911077  1.077951e-05
## B6NTyrCBrdCrlCrl  6.342445e-04        0.0012820218  7.581416e-05
## B6JBomTac        -7.849088e-05       -0.0004538098 -2.213871e-04
## B6J               2.805531e-04        0.0008161647  6.779565e-05
##                          AGAGG  AAAGACAGACAG AAAAACAAGGGAGATAT
## B6NTac           -5.451441e-04  0.0001919794     -0.0006024016
## B6NCrl            1.976184e-04 -0.0007518187     -0.0007702676
## B6NHsd           -1.271293e-03  0.0009523442      0.0027391458
## B6NTyrCBrdCrlCrl  6.162522e-04 -0.0005997973     -0.0013208665
## B6JBomTac         3.186189e-04 -0.0010862144     -0.0010467956
## B6J               9.018506e-05  0.0015819253      0.0009937361
##                           AGGC ACAGACAGCACAGC          AACC         AAAAG
## B6NTac           -0.0020117145   4.751776e-05 -6.836915e-04  0.0001320768
## B6NCrl           -0.0016632974  -7.066192e-04  2.378089e-04  0.0001860139
## B6NHsd            0.0004286222   6.277708e-04  7.886844e-05  0.0001341426
## B6NTyrCBrdCrlCrl -0.0024026780   1.163767e-03  3.846697e-04  0.0003208179
## B6JBomTac        -0.0044165306  -4.766036e-04  2.781311e-04  0.0003026365
## B6J               0.0048059615   9.447221e-04 -3.292601e-04 -0.0004063741
##                         AAAGAG        AGATAT        AGCAGG        ACAGAT
## B6NTac           -0.0016948721 -2.063519e-03 -8.270165e-04 -5.103155e-04
## B6NCrl            0.0008036940  1.502680e-03 -4.141238e-04  2.764000e-04
## B6NHsd           -0.0034238043 -1.268949e-03  5.412703e-05 -2.417208e-03
## B6NTyrCBrdCrlCrl  0.0008106058 -2.490464e-04 -5.730695e-04  9.071935e-05
## B6JBomTac         0.0010697903 -2.440358e-04 -9.435134e-04  2.287869e-04
## B6J              -0.0011144041  6.836742e-05  1.548868e-03  1.104506e-04
##                         AAAGGG        AACAAG        AAAAAG       AGGAGGC
## B6NTac           -4.677551e-04  2.546155e-05  0.0002801716 -9.187078e-04
## B6NCrl            6.409462e-05  8.371351e-04 -0.0004327796 -5.548012e-04
## B6NHsd           -1.681945e-03 -2.853162e-03  0.0001672457  1.284541e-05
## B6NTyrCBrdCrlCrl  7.203940e-04  2.164467e-04  0.0001700484 -1.205617e-03
## B6JBomTac        -5.027060e-04 -6.728850e-04 -0.0005210630 -5.620871e-04
## B6J               1.042552e-03  8.818891e-07  0.0002534426  1.029203e-03
##                           AAGC        AGAGGG AGAGAGGCAGAGGC         AAAGC
## B6NTac           -5.434884e-04  0.0004652802  -3.312943e-04  3.483304e-04
## B6NCrl            7.403019e-04 -0.0001436764   8.352441e-05  7.155242e-05
## B6NHsd           -3.233108e-04 -0.0007647574  -1.441311e-04  8.826565e-05
## B6NTyrCBrdCrlCrl  2.236890e-04  0.0011788727   4.182530e-04  5.895693e-04
## B6JBomTac        -6.453794e-05  0.0007462053   3.510603e-05  7.007548e-04
## B6J               2.297196e-04 -0.0006735322   2.077488e-04 -6.941516e-04
##                         ACAGGC AAACAAGGGAGATAT        ACAGAG          ACAT
## B6NTac           -0.0005248930   -0.0008268581  0.0008632881 -0.0010080044
## B6NCrl           -0.0008088567   -0.0003203606 -0.0001639874 -0.0001848299
## B6NHsd           -0.0002515918    0.0009086674 -0.0005174375 -0.0003440389
## B6NTyrCBrdCrlCrl -0.0003289970   -0.0006655922 -0.0005338290  0.0014361438
## B6JBomTac        -0.0005080720   -0.0006975355 -0.0002898178 -0.0005412730
## B6J               0.0012970047    0.0008500562  0.0006739503  0.0003718677
##                            ACT        AGAGAT          AGGG        AAAAAC
## B6NTac           -3.081124e-04 -0.0029479031  0.0005669797  0.0002932379
## B6NCrl           -7.990339e-05  0.0013842437 -0.0001663411 -0.0012957452
## B6NHsd           -3.130408e-03 -0.0038984548 -0.0005714235  0.0046463337
## B6NTyrCBrdCrlCrl -8.992007e-04 -0.0007953841  0.0009922715 -0.0014301993
## B6JBomTac         3.735892e-04  0.0001349109  0.0003589676 -0.0077436549
## B6J               2.365849e-04 -0.0005866912 -0.0004142656  0.0056165427
##                         AAGAGG      AAAGGAAG AAAAATCTTAAAGG        ACTGCT
## B6NTac            0.0001734283 -0.0001211827   6.059315e-05 -0.0005373543
## B6NCrl            0.0010380652  0.0011953256   2.555483e-04 -0.0008290947
## B6NHsd           -0.0038070492 -0.0034883860   3.463674e-03 -0.0004780371
## B6NTyrCBrdCrlCrl  0.0020798018  0.0009448833   5.935478e-04 -0.0011862780
## B6JBomTac         0.0002366203  0.0009352782   2.402745e-03 -0.0007848794
## B6J               0.0000189325 -0.0009428198  -1.437855e-04  0.0004148049
##                         ACACAT         AAACC        AAAACC       AAAAACC
## B6NTac           -0.0003328363  4.595988e-05 -0.0008159407  0.0002409840
## B6NCrl            0.0003042913  1.862295e-04 -0.0006498141 -0.0003272137
## B6NHsd           -0.0010053734 -3.836788e-04  0.0013755596  0.0021085351
## B6NTyrCBrdCrlCrl -0.0017003803 -3.084245e-04 -0.0009651283 -0.0009297406
## B6JBomTac        -0.0006450958  2.966610e-04 -0.0007742960 -0.0015244409
## B6J               0.0012565483 -2.042011e-04  0.0012831101  0.0017776114
##                       ACACATAT  AGCAGGAGGAGG        ACATAG
## B6NTac           -0.0016465201 -4.637637e-06 -9.896685e-04
## B6NCrl            0.0014689497 -5.021070e-04  9.122955e-04
## B6NHsd           -0.0040666019 -2.520583e-05 -2.393602e-03
## B6NTyrCBrdCrlCrl  0.0010072023  3.323291e-04  3.911062e-04
## B6JBomTac         0.0009190992 -3.435794e-04 -6.417093e-05
## B6J              -0.0009905018  4.401858e-04  8.034789e-04
# need to get it in format to make a box plot. Want to use mu_common_norm

mutation_rate_data <- c()
kmer_names <- c()

for (i in 1:nrow(mu_common_norm)) {
  mutation_rate_data <- c(mutation_rate_data, as.numeric(mu_common_norm[i,]) )
  kmer_names <- c(kmer_names, colnames(mu_common_norm))
}

mutation_rate_df <- data.frame(mutation_rate_data, kmer_names)
head(mutation_rate_df)
##   mutation_rate_data kmer_names
## 1      -0.0001665755         AC
## 2       0.0004075430         AG
## 3      -0.0016936235        AAG
## 4      -0.0004374261     AACCCT
## 5      -0.0036641401       AGAT
## 6      -0.0003605975     AGAGGC
# need to mess with the dimensions
par(mar = c(7,5,2,2))
boxplot (mutation_rate_df[,1] ~ mutation_rate_df[,2], data=mutation_rate_df, las=2, cex.axis=0.5, ylab="Copies changed/gen/copy")
abline(h=0, col="red", lwd=2)

# a plot for the hypermutators - part I (later add in complex)
# add in all the 14 samples
subset_hypermutators <- mu_common_norm[which(rownames(mu_common_norm)=="B6NHsd" | rownames(mu_common_norm)=="B6NJ"),]

# make this into a dataframe that can be a boxplot.
abun_hypermut <- c()
for (j in 1:ncol(mu_common_norm)) {
  for (i in 1:nrow(mu_common_norm)) {
    abun_hypermut <- c(abun_hypermut, mu_common_norm[i,j])
  }
}

line_names <- c(rep(rownames(mu_common_norm), times = ncol(mu_common_norm) ))

kmer_names <- c()
for (i in 1:ncol(mu_common_norm)) {
  kmer_names <- c(kmer_names, c(rep(colnames(mu_common_norm)[i], times = nrow(mu_common_norm))))
}

#now make a dataframe

hypermutator_df <- data.frame(line_names, kmer_names, abun_hypermut)
#abun_df

#hypermutator_df$line_names <- factor(hypermutator_df$line_names, levels=c("B6NHsd","B6NJ"))
# this plot needs some work
ggplot(data = hypermutator_df, aes(x = line_names, y = abun_hypermut)) + 
  geom_boxplot() +
  geom_jitter(aes(color=line_names)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1)) 

# now absolute mutation rates  

mu_common_abs <- calc_mutation_rates_abs(data_sorted_common)
mu_common_abs_norm <- calc_norm_mu(data_sorted_common, mu_common_abs)

per_kmer_mu_abs_norm_mean <- colMeans(mu_common_abs_norm)
sort(per_kmer_mu_abs_norm_mean, decreasing=T)[1:10]
## AAAAATCTTAAAGG         AAAAAC   AAAGACAGACAG           AGAT           AGGC 
##    0.002199034    0.002024336    0.001949065    0.001690533    0.001666607 
##             AT         AGAGAT       ACACATAT         AACCCT         AAAGAG 
##    0.001665801    0.001490404    0.001242575    0.001194939    0.001164543
per_line_mu_abs_norm_mean <- rowMeans(mu_common_abs_norm)

plot(per_kmer_mu_abs_norm_mean) # huh, when we take the absolute value of changes, it doesn't stand out.  Only when we take the directional value

which(per_kmer_mu_abs_norm_mean==max(per_kmer_mu_abs_norm_mean))
## AAAAATCTTAAAGG 
##             55
range(per_kmer_mu_abs_norm_mean)
## [1] 7.796324e-05 2.199034e-03
sort (per_kmer_mu_abs_norm_mean)
##                 AGC      AGAGAGGCAGAGGC                  AC 
##        7.796324e-05        1.682046e-04        2.065710e-04 
##            AGAGAGGC                AACC              AGAGGC 
##        2.146289e-04        2.208613e-04        2.348638e-04 
##               AAAAG        AGCAGGAGGAGG              ACACAG 
##        2.540561e-04        2.579746e-04        2.894117e-04 
##                AAGC               AAACC               AAAGC 
##        3.236761e-04        3.358232e-04        3.458564e-04 
##              ACATAT              AAAAAG               AAGGG 
##        3.556181e-04        3.626719e-04        3.646905e-04 
##                AAGG                 AGG              ACAGAG 
##        3.823177e-04        3.945726e-04        4.020140e-04 
##               AGAGG              AGAGGG                AGGG 
##        4.021618e-04        4.082530e-04        4.225192e-04 
##                ATCC                 ACC                ACAT 
##        4.298685e-04        4.387321e-04        4.458417e-04 
##                  AG     AAACAAGGGAGATAT                ACAG 
##        4.701614e-04        5.491447e-04        5.680881e-04 
##              ACAGGC              AACAAG              AAGGAG 
##        6.252987e-04        6.364125e-04        6.367376e-04 
##              AAAGGG AAGTCTGCACACACATACT              ACAGAT 
##        6.376478e-04        6.483114e-04        6.495259e-04 
##              ACACAT              ACTGCT             AGGAGGC 
##        6.591535e-04        6.928947e-04        6.999353e-04 
##               AAGAG              AGCAGG               AAAGG 
##        7.261712e-04        7.288115e-04        7.304379e-04 
##              ACATAG              AAGAGG              AAAACC 
##        7.437793e-04        7.910306e-04        7.939359e-04 
##   AAAAACAAGGGAGATAT              AGATAT            AAAGGAAG 
##        8.620851e-04        8.624179e-04        9.217655e-04 
##                 ACT                AAAG             AAAAACC 
##        9.223615e-04        9.625643e-04        9.724336e-04 
##               ACAGC                ACCT                 AAG 
##        9.962429e-04        1.057811e-03        1.098304e-03 
##      ACAGACAGCACAGC ACAGACAGCACAGCACAGC              AAAGAG 
##        1.129597e-03        1.139886e-03        1.164543e-03 
##              AACCCT            ACACATAT              AGAGAT 
##        1.194939e-03        1.242575e-03        1.490404e-03 
##                  AT                AGGC                AGAT 
##        1.665801e-03        1.666607e-03        1.690533e-03 
##        AAAGACAGACAG              AAAAAC      AAAAATCTTAAAGG 
##        1.949065e-03        2.024336e-03        2.199034e-03
# AGC actually has the lowest mutation rate (can I give it a Z score?)
mu <- mean(per_kmer_mu_abs_norm_mean)
sigma <- sd(per_kmer_mu_abs_norm_mean)
(AGC_zscore <- abs(7.584906e-05 - mu)/sigma)
## [1] 1.384379
AGAT <- data_sorted_common[,5]
names(AGAT) <- rownames(data_sorted_common)
obj<-contMap(mouse_tree,AGAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

par(mar = c(2, 2, 2, 2))
plot(per_line_mu_abs_norm_mean) # 3 and 10 are the highest

mean(mu_common_abs_norm)
## [1] 0.0007450784
# 7.45 E-4 - lower than Daphnia, higher than Chlamy
# very similar to daphnia, daphnia is 2.74 E-3. 

Not many kmers with a phylogenetic signal

Look for kmers with a phylo signal, using the legit kmers

calc_mutation_rates_abs_random <- function (df_abun) {
  mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
  for (i in 1:ncol(df_abun)) {
    anc_states <- fastAnc(mouse_tree, sample(df_abun[,i]), vars=T, CI=T)
    for (j in 1:nrow(df_abun)) {
      subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
      mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
    }
  }
  rownames(mu_mx) <- rownames(df_abun)
  colnames(mu_mx) <- colnames(df_abun)
  return(mu_mx)
}

new_rownames <- gsub("/", "", rownames(legit_data))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2

rownames(legit_data) <- new_rownames2

mu_mx_rand_1 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_1 <- calc_norm_mu(legit_data, mu_mx_rand_1)
plot(colMeans(mu_mx_rand_norm_1))

#great, we have a few outliers

sort(colMeans(mu_mx_rand_norm_1), decreasing=T)[1:7]
##      AAAGACAGACAG          AAAGACAG           AGCATGG         AAAGAAAGG 
##       0.052676699       0.031799341       0.011855337       0.006614550 
##          AAATGAAT   AAGATAGATAGATCT AAAGAAAGAAAGAAAGG 
##       0.006336106       0.005061688       0.004908095
which(colnames(legit_data)=="AAAGACAG")
## [1] 104
which(colnames(legit_data)=="AAAGACAGACAG")
## [1] 27
which(colnames(legit_data)=="AGCATGG")
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG")
## [1] 80
which(colnames(legit_data)=="AAATGAAT")
## [1] 230
which(colnames(legit_data)=="AAGATAGATAGATCT")
## [1] 433
which(colnames(legit_data)=="AAAGAAAGAAAGAAAGG")
## [1] 133
AAAGACAG <- legit_data[,104]
names(AAAGACAG) <- rownames(legit_data)

AAAGACAG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        68.345280        57.793219        86.259274        71.408165 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##        49.622705        94.690465         0.000000         0.000000 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         0.000000        81.809260         1.901294        51.787538 
##           B6JEiJ        B10ScNHsd 
##        62.088058         0.000000
obj<-contMap(mouse_tree,AAAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGACAGACAG <- legit_data[,27]
names(AAAGACAGACAG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AAAGACAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGACAGACAG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##       881.309078       698.319519       963.295641       838.750237 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##       739.659689      1001.591049         5.401604         4.415329 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         5.880901       919.069355         8.217183       729.856715 
##           B6JEiJ        B10ScNHsd 
##       843.105703         3.901369
# yes good one
AGCATGG <- legit_data[,427]
names(AGCATGG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AGCATGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AGCATGG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         4.234116         2.470631         2.183164         1.252682 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         2.135496         2.137580        15.617626        14.967658 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##        17.962139         3.360821        11.100449         1.949454 
##           B6JEiJ        B10ScNHsd 
##         1.188934         9.442596
AAAGAAAGG <- legit_data[,80]
names(AAAGAAAGG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGAAAGG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        40.503028        95.985217        10.194145       115.770502 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##       140.923140        50.373755        26.822014        27.493762 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         4.731663       202.989221        24.439834        58.854640 
##           B6JEiJ        B10ScNHsd 
##        95.534331        24.380243
AAATGAAT <- legit_data[,230]
names(AAATGAAT) <- rownames(legit_data)
# not really, just one pair.
obj<-contMap(mouse_tree,AAATGAAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# no


AAGATAGATAGATCT <- legit_data[,433]
names(AAGATAGATAGATCT) <- rownames(legit_data)
# nothing here
obj<-contMap(mouse_tree,AAGATAGATAGATCT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGAAAGAAAGAAAGG <- legit_data[,133]
names(AAAGAAAGAAAGAAAGG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AAAGAAAGAAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# do it again and see if you get any additional

mu_mx_rand_2 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_2 <- calc_norm_mu(legit_data, mu_mx_rand_2)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_2))

mu_mx_rand_3 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_3 <- calc_norm_mu(legit_data, mu_mx_rand_3)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_3))

# nope, no others are having phylo signal

# have 6 kmers with phylogenetic signal

Now complex satellites

complex <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/complex_revised_cn.csv", header=T)
samples <- as.vector(complex$Sample)
complex <- as.matrix(complex[,c(2:3)])
rownames(complex) <- samples
complex
##                   cn_major cn_minor
## B6JEiJ           1048990.8 123362.6
## B10ScNHsd        1060820.1 135657.5
## B6NHsd           1317473.6 129958.4
## B6NTac           1207882.2 136154.8
## B6NJ             1031331.9 129371.0
## B6NTyrCBrdCrlCrl 1239977.1 137196.5
## B6JBomTac         999881.7 130793.7
## B6J              1118980.8 124911.5
## B10ScCr          1068573.0 128685.2
## B10SnJ            923277.8 129303.0
## B6ByJ            1069595.9 125996.3
## B6NCrl            954977.0 123889.9
## B10ScSnJ          971019.3 133243.0
## B10J             1056396.0 143527.2
# visualize:
range(complex[,1])
## [1]  923277.8 1317473.6
# first, major
obj<-contMap(mouse_tree,complex[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# B6NHsd expansion (1317473), B6NJ loss (1031331.9)

complex[,1]
##           B6JEiJ        B10ScNHsd           B6NHsd           B6NTac 
##        1048990.8        1060820.1        1317473.6        1207882.2 
##             B6NJ B6NTyrCBrdCrlCrl        B6JBomTac              B6J 
##        1031331.9        1239977.1         999881.7        1118980.8 
##          B10ScCr           B10SnJ            B6ByJ           B6NCrl 
##        1068573.0         923277.8        1069595.9         954977.0 
##         B10ScSnJ             B10J 
##         971019.3        1056396.0
anc_states <- fastAnc(mouse_tree, complex[,1], vars=T, CI=T)
anc_states 
## Ancestral character estimates using fastAnc:
##      15      16      17      18      19      20      21      22      23 
## 1045425 1009114 1014464 1027204 1045385 1081737 1098129 1109461 1157943 
##      24      25      26      27 
## 1161155 1147246 1065387 1062584 
## 
## Variances on ancestral states:
##          15          16          17          18          19          20 
## 16995854715  5103246361  4400057772  4106422853  3438407147  4700132074 
##          21          22          23          24          25          26 
##  3277108126  2784220764  2344618027  1996686318  1609537905  3474444128 
##          27 
##  2832675811 
## 
## Lower & upper 95% CIs:
##        lower   upper
## 15  789903.6 1300947
## 16  869097.3 1149130
## 17  884451.9 1144477
## 18  901604.9 1152804
## 19  930455.1 1160316
## 20  947363.8 1216109
## 21  985926.4 1210331
## 22 1006040.4 1212882
## 23 1063037.1 1252848
## 24 1073573.6 1248736
## 25 1068612.2 1225879
## 26  949856.0 1180918
## 27  958267.5 1166901
#major satellite at ancestor of B6NHsd and B6NJ: 1157943
# gained by B6NHsd:
1317473-1157943 #159,530 copies 
## [1] 159530
# lost by B6NJ
1157943-1031331.9 # 126,611 copies lost
## [1] 126611.1
# second, minor
obj<-contMap(mouse_tree,complex[,2],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

complex[,2] # b6NHsd: 129958.4 , B6NJ: 129371
##           B6JEiJ        B10ScNHsd           B6NHsd           B6NTac 
##         123362.6         135657.5         129958.4         136154.8 
##             B6NJ B6NTyrCBrdCrlCrl        B6JBomTac              B6J 
##         129371.0         137196.5         130793.7         124911.5 
##          B10ScCr           B10SnJ            B6ByJ           B6NCrl 
##         128685.2         129303.0         125996.3         123889.9 
##         B10ScSnJ             B10J 
##         133243.0         143527.2
anc_states <- fastAnc(mouse_tree, complex[,2], vars=T, CI=T)
anc_states 
## Ancestral character estimates using fastAnc:
##       15       16       17       18       19       20       21       22 
## 130286.5 132589.0 133019.9 133565.6 137054.8 127984.1 128363.0 128873.8 
##       23       24       25       26       27 
## 130624.7 132347.7 132858.3 126804.8 127297.9 
## 
## Variances on ancestral states:
##       15       16       17       18       19       20       21       22 
## 35461212 10647732  9180555  8567897  7174107  9806649  6837563  5809172 
##       23       24       25       26       27 
##  4891957  4166011  3358240  7249297  5910272 
## 
## Lower & upper 95% CIs:
##       lower    upper
## 15 118614.9 141958.2
## 16 126193.3 138984.6
## 17 127081.2 138958.6
## 18 127828.5 139302.7
## 19 131805.0 142304.5
## 20 121846.3 134122.0
## 21 123237.8 133488.1
## 22 124149.8 133597.9
## 23 126289.6 134959.8
## 24 128347.1 136348.2
## 25 129266.5 136450.1
## 26 121527.6 132082.0
## 27 122532.9 132062.8
# 130624.7 
# how many copies and kb did these lines lose and gain?
130624.7 -129958.4 # B6NHsd : lost 666 copies
## [1] 666.3
130624.7 - 129371 # B6NJ : lost 1253 copies
## [1] 1253.7
mu_complex <- calc_mutation_rates(complex)
#mu_complex

mu_complex_norm <- calc_norm_mu(complex, mu_complex)

hypermutator_df$type <- "simple"
head(hypermutator_df)
##         line_names kmer_names abun_hypermut   type
## 1           B6NTac         AC -1.665755e-04 simple
## 2           B6NCrl         AC -1.037055e-04 simple
## 3           B6NHsd         AC -1.520792e-04 simple
## 4 B6NTyrCBrdCrlCrl         AC  4.070876e-05 simple
## 5        B6JBomTac         AC -1.399663e-04 simple
## 6              B6J         AC  4.700042e-04 simple
abun_complex <- c()
for (j in 1:ncol(mu_complex_norm)) {
  for (i in 1:nrow(mu_complex_norm)) {
    abun_complex <- c(abun_complex, mu_complex_norm[i,j])
  }
}

line_names <- c(rep(rownames(mu_complex_norm), times = ncol(mu_complex_norm) ))

kmer_names <- c()
for (i in 1:ncol(mu_complex_norm)) {
  kmer_names <- c(kmer_names, c(rep(colnames(mu_complex_norm)[i], times = nrow(mu_complex_norm))))
}

#now make a dataframe

complex_df <- data.frame(line_names, kmer_names, abun_complex)
complex_df$type <- "complex"
head(complex_df)
##         line_names kmer_names  abun_complex    type
## 1           B6JEiJ   cn_major -0.0000916069 complex
## 2        B10ScNHsd   cn_major  0.0001069893 complex
## 3           B6NHsd   cn_major  0.0009983401 complex
## 4           B6NTac   cn_major  0.0003907011 complex
## 5             B6NJ   cn_major -0.0015786951 complex
## 6 B6NTyrCBrdCrlCrl   cn_major  0.0012629646 complex
colnames(complex_df) <- c("line_names", "kmer_names", "abun", "type")
colnames(hypermutator_df) <- c("line_names", "kmer_names", "abun", "type")

perline_boxplot_df <- rbind(hypermutator_df, complex_df)

#perline_boxplot_df$line_names <- factor(perline_boxplot_df$line_names, levels = c(""))
## make the plot including mu_complex
ggplot(data = perline_boxplot_df, aes(x = line_names, y = abun)) + 
  geom_boxplot() +
  geom_jitter(aes(color=type, alpha=0.6)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

mu_complex_abs <- calc_mutation_rates_abs(complex)
mean(mu_complex_abs[,2])
## [1] 26.05974
mu_complex_abs_norm <- calc_norm_mu(complex, mu_complex_abs)
#mu_complex_abs_norm

colMeans(mu_complex_abs)
##  cn_major  cn_minor 
## 559.13985  26.05974
colMeans(mu_complex_abs_norm)
##     cn_major     cn_minor 
## 0.0005004309 0.0001976812
#major: 4.8 E-4, minor: 1.9 E-4
# major satellite has a higher mutation rate than minor, even after normalizing
# might be because of increased recombination -- more distal to the centromere


library(corrplot)

combined_mu_norm <- cbind(mu_complex_norm, mu_common_norm)
combined_mu_norm[,1:10]
##                       cn_major      cn_minor            AC            AG
## B6JEiJ           -9.160690e-05 -1.615809e-04 -1.665755e-04  4.075430e-04
## B10ScNHsd         1.069893e-04 -7.387808e-05 -1.037055e-04  5.307961e-04
## B6NHsd            9.983401e-04 -3.696561e-05 -1.520792e-04 -1.564207e-03
## B6NTac            3.907011e-04  2.792860e-04  4.070876e-05  1.091716e-03
## B6NJ             -1.578695e-03 -4.101223e-04 -1.399663e-04  4.652882e-04
## B6NTyrCBrdCrlCrl  1.262965e-03  5.102066e-04  4.700042e-04 -4.409409e-04
## B6JBomTac        -5.087033e-04  2.367385e-04  1.526023e-04  1.838519e-04
## B6J               4.575415e-04 -1.616087e-04  9.140961e-05  4.974917e-05
## B10ScCr           1.728454e-04 -1.568226e-04  2.490011e-04 -7.222850e-04
## B10SnJ           -2.893223e-04 -8.429768e-05  4.230222e-04 -5.365049e-04
## B6ByJ            -1.255219e-04 -8.906739e-05 -3.731076e-04  3.595034e-04
## B6NCrl           -7.866809e-04 -2.184930e-04 -1.704128e-04  9.820323e-05
## B10ScSnJ         -1.597972e-04  6.259891e-06 -2.642960e-04 -6.374373e-05
## B10J              7.632285e-05  3.422101e-04 -9.510261e-05  6.792768e-05
##                            AAG        AACCCT          AGAT        AGAGGC
## B6JEiJ           -0.0016936235 -0.0004374261 -0.0036641401 -3.605975e-04
## B10ScNHsd         0.0010510781 -0.0008913244  0.0015920712 -3.000287e-04
## B6NHsd           -0.0033459144 -0.0009501471 -0.0043442036  2.057965e-06
## B6NTac            0.0012808578 -0.0011192028  0.0009625905  2.326053e-04
## B6NJ              0.0004670982 -0.0021052905  0.0017003030 -2.519822e-05
## B6NTyrCBrdCrlCrl -0.0009093677  0.0033453313 -0.0017989410  2.586653e-04
## B6JBomTac         0.0005484810  0.0011256591  0.0001359405  2.273234e-04
## B6J               0.0005808924  0.0004389475  0.0012644830  9.134611e-05
## B10ScCr          -0.0020978667  0.0001943518 -0.0025471971  2.915746e-04
## B10SnJ            0.0016646539  0.0027878050  0.0037813664  5.069733e-04
## B6ByJ             0.0007397267 -0.0014161398  0.0005660927 -4.350530e-04
## B6NCrl            0.0005409763  0.0001317758  0.0005682900 -1.447509e-04
## B10ScSnJ          0.0002449262 -0.0012163915 -0.0002705450 -2.517623e-04
## B10J              0.0002107909 -0.0005693525  0.0004712985 -1.601563e-04
##                           AAAG           AGG
## B6JEiJ           -1.514244e-03 -3.295951e-04
## B10ScNHsd         1.043085e-03 -1.410237e-04
## B6NHsd           -2.464712e-03 -6.330540e-04
## B6NTac            1.582728e-03 -4.349942e-05
## B6NJ              8.803666e-04 -3.848845e-04
## B6NTyrCBrdCrlCrl -1.402117e-03  7.395840e-04
## B6JBomTac        -1.279014e-06  2.846829e-04
## B6J               4.982097e-04  1.979003e-04
## B10ScCr          -1.573901e-03  3.587638e-04
## B10SnJ            5.719602e-04  1.111715e-03
## B6ByJ             4.967773e-04 -9.523429e-05
## B6NCrl            6.199326e-04 -1.910845e-04
## B10ScSnJ          3.594212e-04 -2.978637e-04
## B10J              4.671663e-04 -7.151320e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)

#cor_mu$AAAAACAAGGAGATAT
#cor_mu[,"AAAAACAAGGAGATAT"]
#cor_mu[,30]

# relate copy number changes to sequence variability in the complex satellites

variability <- read.csv(file="~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/variation_centromere_sats.csv", header = T)
variability
##    File              Line error_rate_major error_rate_minor   mu_major
## 1  R104           BL6NTac        0.0356000        0.0448000   453.6645
## 2  R114           BL6NCrl        0.0355000        0.0446000  -872.7919
## 3  R124           BL6NHsd        0.0352000        0.0447000  1156.0206
## 4  R134 BL6NTyrCBrdCrlCrl        0.0354000        0.0448231  1448.9304
## 5  R144        BL6JBomTac        0.0358362        0.0447000  -540.5402
## 6    R2              BL6J        0.0350000        0.0444000   486.1765
## 7   R22         BL10ScSnJ        0.0365000        0.0441000  -162.1085
## 8   R32           BL10SnJ        0.0353000        0.0423000  -291.9592
## 9   R42          BL10ScCr        0.0349000        0.0421000   177.5475
## 10  R52             BL6NJ        0.0352000        0.0443000 -1811.1508
## 11  R62             BL10J        0.0358000        0.0431000    79.7868
## 12  R72            BL6ByJ        0.0355000        0.0447000  -137.8392
## 13  R84           BL6JEiJ        0.0357000        0.0448000   -97.5968
## 14  R94        BL10ScNHsd        0.0357000        0.0431000   111.8450
##       mu_minor
## 1   36.9628491
## 2  -28.1580321
## 3   -4.8286223
## 4   67.7851609
## 5   30.1363125
## 6  -20.5724470
## 7    0.8326899
## 8  -11.1769438
## 9  -20.9461098
## 10 -54.4881282
## 11  46.9015225
## 12 -11.4329535
## 13 -20.4892330
## 14 -10.1253442
ggplot(data = variability, aes(x = mu_major, y = error_rate_major)) + 
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

ggplot(data = variability, aes(x = mu_minor, y = error_rate_minor)) + 
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

Test for phylogenetic signal

data_subset <- data_sorted_common[,5]
names(data_subset) <- rownames(data_sorted_common)

test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)


data_subset <- data_sorted_common[,27]
names(data_subset) <- rownames(data_sorted_common)

test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
test_phlosig$P
## [1] 0.001
phylosig_test <- c()
for (i in 1:ncol(legit_data)) {
  subset <- legit_data[,i]
  names(subset) <- rownames(legit_data)
  test_phlosig <- phylosig(mouse_tree, subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
  phylosig_test <- c(phylosig_test, test_phlosig$P)
  
}

possibilities <- which(phylosig_test < 0.05)

filtered_possibilities <- c()
for (i in possibilities) {
  prop <- min(legit_data[,i])/max(legit_data[,i])
  diff <- max(legit_data[,i]) - min(legit_data[,i])
  if (prop < 0.3 & diff > 15){
    filtered_possibilities <- c(filtered_possibilities, i)
  }
}

filtered_possibilities # this is pretty good. But for some reason 80 isn't showing up
## [1]  27  71  72 104 349 427
# the ones in the table:
which(colnames(legit_data)=="AAAGACAGACAG") #27
## [1] 27
which(colnames(legit_data)=="AAAGACAG") #104
## [1] 104
which(colnames(legit_data)=="AGCATGG") # 427
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG") #80
## [1] 80
which(colnames(legit_data)=="AGGGC") #71
## [1] 71
which(colnames(legit_data)=="AAGAAGAAGAGG") #72
## [1] 72
min(legit_data[,27])
## [1] 3.901369
# check all these

phylosig_test[1:10]
##  [1] 0.139 0.645 0.385 0.460 0.636 0.124 0.511 0.167 0.585 0.505
phylosig_test[80] # why not significant? this one looks legit.
## [1] 0.445
colnames(legit_data)[15]
## [1] "ACAG"
check  <- legit_data[,15] 
names(check) <- rownames(legit_data)
check
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        1623.5399        1521.7861        1250.1318        2013.6151 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##        1680.3956        1627.5711        1148.3129        1168.5650 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         685.9261        1637.6058        1138.5821        1530.7700 
##           B6JEiJ        B10ScNHsd 
##        1546.1709        1123.8266
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

colnames(legit_data)[71]
## [1] "AGGGC"
check_2  <- legit_data[,71]
names(check_2) <- rownames(legit_data)
check_2
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         56.22773         77.83892         82.72548         86.91108 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         51.54837         53.20102         84.08393        189.24518 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##        107.44587         88.20542         99.42593         64.28542 
##           B6JEiJ        B10ScNHsd 
##         57.50664         86.89554
obj<-contMap(mouse_tree,check_2,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

colnames(legit_data)[72]
## [1] "AAGAAGAAGAGG"
check_3  <- legit_data[,72]
names(check_3) <- rownames(legit_data)

obj<-contMap(mouse_tree,check_3,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

colnames(legit_data)[80]
## [1] "AAAGAAAGG"
check  <- legit_data[,80]
names(check) <- rownames(legit_data)

obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

#check

# want to apply some sort of filtering - requiring a difference of x%?
colnames(legit_data)[349]
## [1] "AAAGAAAGGAAGGAAGG"
check  <- legit_data[,349]
names(check) <- rownames(legit_data)

obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# this one is good
check
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        7.6277800        9.2428867        2.3568429        8.5065947 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##       13.2646907       17.5286549        9.6558086       11.3011399 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##        2.5862785        9.5971337        8.6801232        0.1576554 
##           B6JEiJ        B10ScNHsd 
##       14.2165408        9.8961555

Now, look at the Ymin satellite

Ymin <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/Ymin_cn.csv", header=T)
samples <- as.vector(Ymin$Sample)
Ymin <- as.matrix(Ymin$cn_HOR_Ymin)
rownames(Ymin) <- samples
Ymin
##                      [,1]
## B6JEiJ           28.63662
## B10ScNHsd        28.77115
## B6NHsd           21.89659
## B6NTac           28.92109
## B6NJ             20.58688
## B6NTyrCBrdCrlCrl 30.30322
## B6JBomTac        29.80160
## B6J              26.39369
## B10ScCr          20.03651
## B10SnJ           26.36346
## B6ByJ            27.44637
## B6NCrl           28.15001
## B10ScSnJ         25.73320
## B10J             28.53049
colnames(Ymin) <- "cn_HOR_Ymin"

# visualize:
range(Ymin[,1])
## [1] 20.03651 30.30322
# first, major
obj<-contMap(mouse_tree,Ymin[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

anc_states <- fastAnc(mouse_tree, Ymin[,1], vars=T, CI=T)
anc_states 
## Ancestral character estimates using fastAnc:
##       15       16       17       18       19       20       21       22 
## 26.44703 25.73685 25.63808 25.49412 27.32168 27.15721 26.81652 26.57562 
##       23       24       25       26       27 
## 25.92030 26.34371 25.85353 27.87069 27.97749 
## 
## Variances on ancestral states:
##        15        16        17        18        19        20        21 
## 23.326382  7.004077  6.038968  5.635962  4.719127  6.450813  4.497748 
##        22        23        24        25        26        27 
##  3.821273  3.217929  2.740402  2.209050  4.768587  3.887776 
## 
## Lower & upper 95% CIs:
##       lower    upper
## 15 16.98074 35.91332
## 16 20.54967 30.92403
## 17 20.82152 30.45465
## 18 20.84104 30.14719
## 19 23.06386 31.57950
## 20 22.17911 32.13530
## 21 22.65978 30.97327
## 22 22.74420 30.40705
## 23 22.40433 29.43626
## 24 23.09909 29.58832
## 25 22.94041 28.76665
## 26 23.59062 32.15076
## 27 24.11287 31.84211
mu_Ymin <- calc_mutation_rates(Ymin)
#mu_complex
mu_Ymin
##                    cn_HOR_Ymin
## B6JEiJ            0.0045591078
## B10ScNHsd         0.0105033902
## B6NHsd           -0.0291572766
## B6NTac            0.0250231070
## B6NJ             -0.0822913756
## B6NTyrCBrdCrlCrl  0.0695263271
## B6JBomTac         0.0157250869
## B6J              -0.0136534962
## B10ScCr          -0.0234232196
## B10SnJ            0.0021313171
## B6ByJ             0.0030427341
## B6NCrl            0.0088948335
## B10ScSnJ          0.0003549335
## B10J              0.0087594744
mean(mu_Ymin)
## [1] -3.611772e-07
mu_Ymin_norm <- calc_norm_mu(Ymin, mu_Ymin)

mu_Ymin_norm
##                    cn_HOR_Ymin
## B6JEiJ            1.635807e-04
## B10ScNHsd         3.844343e-04
## B6NHsd           -1.124882e-03
## B6NTac            9.498703e-04
## B6NJ             -3.182984e-03
## B6NTyrCBrdCrlCrl  2.689239e-03
## B6JBomTac         5.620620e-04
## B6J              -4.880171e-04
## B10ScCr          -9.187696e-04
## B10SnJ            8.281188e-05
## B6ByJ             1.134649e-04
## B6NCrl            3.346990e-04
## B10ScSnJ          1.384399e-05
## B10J              3.206053e-04
# absolute mutation rate

mu_Ymin_abs <- calc_mutation_rates_abs(Ymin)
mean(mu_Ymin_abs)
## [1] 0.02121755
range(mu_Ymin_abs)
## [1] 0.0003549335 0.0822913756
mu_Ymin_abs_norm <- calc_norm_mu(Ymin, mu_Ymin_abs)
mean(mu_Ymin_abs_norm)
## [1] 0.0008092332
mu_complex_norm_all <- cbind(mu_complex_norm, mu_Ymin_norm)

hypermutator_df$type <- "simple"
head(hypermutator_df)
##         line_names kmer_names          abun   type
## 1           B6NTac         AC -1.665755e-04 simple
## 2           B6NCrl         AC -1.037055e-04 simple
## 3           B6NHsd         AC -1.520792e-04 simple
## 4 B6NTyrCBrdCrlCrl         AC  4.070876e-05 simple
## 5        B6JBomTac         AC -1.399663e-04 simple
## 6              B6J         AC  4.700042e-04 simple
abun_complex <- c()
for (j in 1:ncol(mu_complex_norm_all)) {
  for (i in 1:nrow(mu_complex_norm_all)) {
    abun_complex <- c(abun_complex, mu_complex_norm_all[i,j])
  }
}

line_names <- c(rep(rownames(mu_complex_norm_all), times = ncol(mu_complex_norm_all) ))

kmer_names <- c()
for (i in 1:ncol(mu_complex_norm_all)) {
  kmer_names <- c(kmer_names, c(rep(colnames(mu_complex_norm_all)[i], times = nrow(mu_complex_norm_all))))
}

#now make a dataframe

complex_df <- data.frame(line_names, kmer_names, abun_complex)
complex_df$type <- "complex"
head(complex_df)
##         line_names kmer_names  abun_complex    type
## 1           B6JEiJ   cn_major -0.0000916069 complex
## 2        B10ScNHsd   cn_major  0.0001069893 complex
## 3           B6NHsd   cn_major  0.0009983401 complex
## 4           B6NTac   cn_major  0.0003907011 complex
## 5             B6NJ   cn_major -0.0015786951 complex
## 6 B6NTyrCBrdCrlCrl   cn_major  0.0012629646 complex
colnames(complex_df) <- c("line_names", "kmer_names", "abun", "type")
colnames(hypermutator_df) <- c("line_names", "kmer_names", "abun", "type")

perline_boxplot_df <- rbind(hypermutator_df, complex_df)

#perline_boxplot_df$line_names <- factor(perline_boxplot_df$line_names, levels = c(""))
## make the plot including mu_complex
ggplot(data = perline_boxplot_df, aes(x = line_names, y = abun)) + 
  geom_boxplot() +
  geom_jitter(aes(color=type, alpha=0.6)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

library(corrplot)

mu_complex_norm2 <- cbind(mu_complex_norm, mu_Ymin_norm)


combined_mu_norm <- cbind(mu_complex_norm2, mu_common_norm)
combined_mu_norm[,1:10]
##                       cn_major      cn_minor   cn_HOR_Ymin            AC
## B6JEiJ           -9.160690e-05 -1.615809e-04  1.635807e-04 -1.665755e-04
## B10ScNHsd         1.069893e-04 -7.387808e-05  3.844343e-04 -1.037055e-04
## B6NHsd            9.983401e-04 -3.696561e-05 -1.124882e-03 -1.520792e-04
## B6NTac            3.907011e-04  2.792860e-04  9.498703e-04  4.070876e-05
## B6NJ             -1.578695e-03 -4.101223e-04 -3.182984e-03 -1.399663e-04
## B6NTyrCBrdCrlCrl  1.262965e-03  5.102066e-04  2.689239e-03  4.700042e-04
## B6JBomTac        -5.087033e-04  2.367385e-04  5.620620e-04  1.526023e-04
## B6J               4.575415e-04 -1.616087e-04 -4.880171e-04  9.140961e-05
## B10ScCr           1.728454e-04 -1.568226e-04 -9.187696e-04  2.490011e-04
## B10SnJ           -2.893223e-04 -8.429768e-05  8.281188e-05  4.230222e-04
## B6ByJ            -1.255219e-04 -8.906739e-05  1.134649e-04 -3.731076e-04
## B6NCrl           -7.866809e-04 -2.184930e-04  3.346990e-04 -1.704128e-04
## B10ScSnJ         -1.597972e-04  6.259891e-06  1.384399e-05 -2.642960e-04
## B10J              7.632285e-05  3.422101e-04  3.206053e-04 -9.510261e-05
##                             AG           AAG        AACCCT          AGAT
## B6JEiJ            4.075430e-04 -0.0016936235 -0.0004374261 -0.0036641401
## B10ScNHsd         5.307961e-04  0.0010510781 -0.0008913244  0.0015920712
## B6NHsd           -1.564207e-03 -0.0033459144 -0.0009501471 -0.0043442036
## B6NTac            1.091716e-03  0.0012808578 -0.0011192028  0.0009625905
## B6NJ              4.652882e-04  0.0004670982 -0.0021052905  0.0017003030
## B6NTyrCBrdCrlCrl -4.409409e-04 -0.0009093677  0.0033453313 -0.0017989410
## B6JBomTac         1.838519e-04  0.0005484810  0.0011256591  0.0001359405
## B6J               4.974917e-05  0.0005808924  0.0004389475  0.0012644830
## B10ScCr          -7.222850e-04 -0.0020978667  0.0001943518 -0.0025471971
## B10SnJ           -5.365049e-04  0.0016646539  0.0027878050  0.0037813664
## B6ByJ             3.595034e-04  0.0007397267 -0.0014161398  0.0005660927
## B6NCrl            9.820323e-05  0.0005409763  0.0001317758  0.0005682900
## B10ScSnJ         -6.374373e-05  0.0002449262 -0.0012163915 -0.0002705450
## B10J              6.792768e-05  0.0002107909 -0.0005693525  0.0004712985
##                         AGAGGC          AAAG
## B6JEiJ           -3.605975e-04 -1.514244e-03
## B10ScNHsd        -3.000287e-04  1.043085e-03
## B6NHsd            2.057965e-06 -2.464712e-03
## B6NTac            2.326053e-04  1.582728e-03
## B6NJ             -2.519822e-05  8.803666e-04
## B6NTyrCBrdCrlCrl  2.586653e-04 -1.402117e-03
## B6JBomTac         2.273234e-04 -1.279014e-06
## B6J               9.134611e-05  4.982097e-04
## B10ScCr           2.915746e-04 -1.573901e-03
## B10SnJ            5.069733e-04  5.719602e-04
## B6ByJ            -4.350530e-04  4.967773e-04
## B6NCrl           -1.447509e-04  6.199326e-04
## B10ScSnJ         -2.517623e-04  3.594212e-04
## B10J             -1.601563e-04  4.671663e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)